Date: Mon Apr 27 16:44:30 2020
Scientist: Ran Yin
Sequencing (Waksman): Dibyendu Kumar
Statistics: Davit Sargsyan
Principal Investigator: Ah-Ng Kong
# Taxonomic Ranks:
# **K**ing **P**hillip **C**an n**O**t **F**ind **G**reen **S**ocks
# * Kingdom
# * Phylum
# * Class
# * Order
# * Family
# * Genus
# * Species
options(stringsAsFactors = FALSE,
scipen = 999)
# # Increase mmemory size to 64 Gb----
# invisible(utils::memory.limit(65536))
# str(knitr::opts_chunk$get())
# # NOTE: the below does not work!
# knitr::opts_chunk$set(echo = FALSE,
# message = FALSE,
# warning = FALSE,
# error = FALSE)
# require(knitr)
# require(kableExtra)
# require(shiny)
require(phyloseq)
Loading required package: phyloseq
require(data.table)
Loading required package: data.table
data.table 1.12.2 using 18 threads (see ?getDTthreads). Latest news: r-datatable.com
require(ggplot2)
Loading required package: ggplot2
require(plotly)
Loading required package: plotly
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
require(DT)
Loading required package: DT
source("source/functions_may2019.R")
November 2018 Batch: Nrf2 KO (-/-) Mice
May 2019 Batch: Wild Type Mice
FastQ files were downloaded from Dr. Kumar’s DropBox. A total of 60 files (2 per sample, pair-ended) and 2 metadata files were downloaded.
Data processing scripts (nrf2ubiome_dada2_nov2018_v1.Rmd and nrf2ubiome_dada2_may2019_v1.Rmd) were developed using DADA2 Pipeline Tutorial (1.12) with tips and tricks from the University of Maryland Shool of Medicine Institute for Genome Sciences (IGS) Microbiome Analysis Workshop (April 8-11, 2019). The oresults of the DADA2 scripts (data_nov2018/ps_nov2018.RData and data_may2019/ps_may2019.RData) are explored in this document.
# Load data----
# Counts
load("data_nov2018/ps_nov2018.RData")
ps_nov2018 <- copy(ps)
rm(ps)
# Remove "Undetermined" sample
ps_nov2018 <- subset_samples(ps_nov2018,
Name != "Undetermined_S0")
load("data_may2019/ps_may2019.RData")
gc(verbose = FALSE)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4012301 214.3 7171411 383.0 6600597 352.6
Vcells 7829899 59.8 14786712 112.9 12233664 93.4
# Taxonomy (use the same one for both batches!)
load("data_may2019/taxa.RData")
taxa <- data.table(seq16s = rownames(taxa),
taxa)
# Samples
meta18 <- ps_nov2018@sam_data
datatable(meta18,
options = list(pageLength = nrow(meta18)),
caption = "Nrf2 KO (Nov 18) Meta Data")
meta19 <- ps_may2019@sam_data
datatable(meta19,
options = list(pageLength = nrow(meta19)),
caption = "WT (May 19) Meta Data")
pss <- merge_phyloseq(ps_nov2018,
ps_may2019)
tmp <- pss@sam_data
# Diet
meta <- data.table(Sample = rownames(pss@sam_data),
Diet = tmp$Diet)
meta$Diet <- gsub(x = meta$Diet,
pattern = " Control",
replacement = "")
meta$Diet[!is.na(tmp$TREATMENT)] <- tmp$TREATMENT[!is.na(tmp$TREATMENT)]
meta$Diet <- gsub(x = meta$Diet,
pattern = "Control",
replacement = "AIN93M")
meta$Diet[is.na(meta$Diet)] <- "Pooled"
meta$Diet <- factor(meta$Diet,
levels = c("AIN93M",
"PEITC",
"Pooled"))
# Gemotype
meta$Genotype <- "Nrf2 KO"
meta$Genotype[is.na(tmp$Sex)] <- "Wild Type"
meta$Genotype <- factor(meta$Genotype,
levels = c("Wild Type",
"Nrf2 KO"))
# Time
meta$Week <- as.numeric(as.character(tmp$Week)) - 4
meta$Week[meta$Genotype == "Wild Type"] <- as.numeric(gsub(x = tmp$WEEK[meta$Genotype == "Wild Type"],
pattern = "week ",
replacement = ""))
meta$Week <- paste("Week",
meta$Week)
# LAbel correction, as per Ran's email, Tue, Apr 21, 3:29 PM (6 days ago)
meta$Week[meta$Week == "Week 5"] <- "Week 4"
# meta$Week <- factor(meta$Week,
# levels = c("Week 0",
# "Week 1",
# "Week 4",
# "Week 5"))
meta$Week <- factor(meta$Week,
levels = c("Week 0",
"Week 1",
"Week 4"))
# Mouse ID
meta$Mouse_Num <- as.numeric(as.character(tmp$MouseNum))
meta$Mouse_Num[meta$Genotype == "Wild Type"] <- as.numeric(substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
start = 3,
stop = 3))
# Cage number
meta$Cage <- as.character(tmp$Cage)
meta$Cage[meta$Genotype == "Wild Type"] <- substr(x = tmp$SAMPLE_NAME[meta$Genotype == "Wild Type"],
start = 2,
stop = 2)
meta$Cage <- factor(meta$Cage)
meta <- data.frame(meta)
rownames(meta) <- meta$Sample
meta
# Edit meta data
sample_data(pss) <- meta
pss@sam_data
Sample Data: [69 samples by 6 sample variables]:
dim(pss@otu_table@.Data)
[1] 69 17046
# Remove OTU unmapped to Bacteria
ps0 <- subset_taxa(pss,
Kingdom == "Bacteria")
dim(ps0@otu_table@.Data)
[1] 69 16351
p1 <- ggplot(smpl,
aes(x = Sample,
y = Total,
group =Diet,
fill = Diet)) +
facet_wrap(~ Genotype + Week,
scale = "free") +
geom_bar(stat = "identity",
color = "black") +
scale_x_discrete("") +
scale_y_continuous("Number of Reads") +
scale_fill_grey("Treatment",
start = 0.1,
end = 1,
na.value = "red",
aesthetics = "fill") +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
# axis.ticks.x=element_blank(),
legend.position = "top")
tiff(filename = "tmp/seq_depth_nov2018_may2019.tiff",
height =6,
width = 8,
units = "in",
res = 600,
compression = "lzw+p")
print(p1)
graphics.off()
print(p1)
t2 <- data.table(table(tax_table(ps0)[, "Phylum"],
exclude = NULL))
t2$V1[is.na(t2$V1)] <- "Unknown"
setorder(t2, -N)
t2[, pct := N/sum(N)]
setorder(t2, -N)
colnames(t2) <- c("Phylum",
"Number of OTUs",
"Percent of OTUs")
datatable(t2,
rownames = FALSE,
caption = "Number of Bacterial OTUs by Phylum",
class = "cell-border stripe",
options = list(search = FALSE,
pageLength = nrow(t2))) %>%
formatCurrency(columns = 2,
currency = "",
mark = ",",
digits = 0) %>%
formatPercentage(columns = 3,
digits = 2)
ps1 <- subset_taxa(ps0,
!is.na(Phylum))
dim(ps1@otu_table@.Data)
[1] 69 15919
Remove:
1. Unmapped OTUs (“Unknown”).
2. Cyanobacteria: aerobic, photosynthesizing bacteria that probably got into the sample through food.
NOTE: Chloroflexi might be ok.
ps0 <- subset_taxa(ps0,
(!(Phylum %in% c("Unknown",
"Cyanobacteria"))))
Shannon index (aka Shannon enthrophy) is calculated as:
H’ = -sum(1 to R)p(i)ln(p(i)) When there is exactly 1 type of data (e.g. a single species in the sample), H’=0. The opposite scenario is when there are R>1 species present in the sample in the exact same amounts and H’=ln(R).
Shannon’s diversity index was calculated for each sample and ploted over time.
shannon.ndx <- estimate_richness(ps0,
measures = "Shannon")
shannon.ndx <- data.table(Sample = rownames(shannon.ndx),
shannon.ndx)
smpl <- merge(smpl,
shannon.ndx,
by = "Sample")
p1 <- ggplot(smpl,
aes(x = Total,
y = Shannon,
fill = Genotype,
shape = Week)) +
geom_point(size = 2) +
scale_shape_manual(breaks = unique(smpl$Week),
values = 21:24)
tiff(filename = "tmp/shannon_vs_depth_nov18_may19.tiff",
height = 5,
width = 6,
units = "in",
res = 600,
compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
Even though estimate_richness function does not adjust for the sequencing depth, there is no correlation between the index and the sample’s sequecing depth. Proceed with the comparison.
p1 <- plot_richness(ps0,
x = "Week",
measures = "Shannon") +
facet_wrap(~ Genotype,
scale = "free_x") +
geom_line(aes(group = paste0(Diet,
Cage,
Mouse_Num)),
color = "black") +
geom_point(aes(fill = Diet),
shape = 21,
size = 3,
color = "black") +
scale_x_discrete("") +
theme(axis.text.x = element_text(angle = 30,
hjust = 1,
vjust = 1))
ggplotly(p = p1,
tooltip = c("Mouse_Num",
"value"))
p1 <- p1 +
scale_fill_discrete("") +
theme(legend.position = "top")
tiff(filename = "tmp/shannon_nov18_may19.tiff",
height = 5,
width = 8,
units = "in",
res = 600,
compression = "lzw+p")
print(p1)
graphics.off()
The plot above suggests that the largest differences in alpha diversity (as measured by Shannon’s index) are in genotype. THis was also confirmed in the 3rd study (September 2019 batch)
# Average shannon index by treatment group
tmp <- data.table(copy(smpl))
tmp[, mu := mean(Shannon),
by = list(Diet,
Genotype,
Week)]
tmp[, sem := sd(Shannon)/sqrt(.N),
by = list(Diet,
Genotype,
Week)]
tmp <- unique(tmp[, c("Diet",
"Genotype",
"Week",
"mu",
"sem")])
p1 <- ggplot(tmp,
aes(x = Week,
y = mu,
ymin = mu - sem,
ymax = mu + sem,
fill = Diet,
group = Diet)) +
facet_wrap(~ Genotype,
scale = "free_x") +
geom_errorbar(position = position_dodge(0.3),
width = 0.4) +
geom_line(position = position_dodge(0.3)) +
geom_point(size = 3,
shape = 21,
position = position_dodge(0.3)) +
scale_x_discrete("") +
scale_y_continuous("Shannon Index") +
scale_fill_grey("Treatment",
start = 0,
end = 1,
na.value = "red",
aesthetics = "fill") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# panel.border = element_blank(),
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
axis.ticks.x=element_blank(),
legend.position = "top")
tiff(filename = "tmp/avg_shannon_nov18_may19.tiff",
height = 4,
width = 6,
units = "in",
res = 600,
compression = "lzw+p")
print(p1)
graphics.off()
print(p1)
NOTE: cannot test diet effect because of unassigend pooled samples in the KO mice. For the same reason, cannot use mixed effects model on the whole data set. At best, testing genotype and time trend by assuming Week 4 in WT = Week 5 in KO.
smpl$Timepoint <- as.numeric(smpl$Week)
smpl$Timepoint[smpl$Timepoint == 4] <- 3
m1 <- lm(Shannon ~ Timepoint*Genotype,
data = smpl)
summary(m1)
Call:
lm(formula = Shannon ~ Timepoint * Genotype, data = smpl)
Residuals:
Min 1Q Median 3Q Max
-0.45931 -0.09344 0.02849 0.08560 0.34790
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.55295 0.08207 79.842 <0.0000000000000002 ***
Timepoint 0.04053 0.03799 1.067 0.2900
GenotypeNrf2 KO 0.35620 0.13511 2.636 0.0105 *
Timepoint:GenotypeNrf2 KO 0.09221 0.05778 1.596 0.1154
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1699 on 65 degrees of freedom
Multiple R-squared: 0.7687, Adjusted R-squared: 0.7581
F-statistic: 72.02 on 3 and 65 DF, p-value: < 0.00000000000000022
No significant time trend; significantly higher alpha diversity in the Nrf2 KO mice. This is possibly a batch effect but we observed same trend in Study 3 (Crabbery and PEITC with both genotypes in the same study).
counts_p <- counts_by_tax_rank(dt1 = otu,
aggr_by = "Phylum")
fb <- t(counts_p[Phylum %in% c("Bacteroidetes",
"Firmicutes"), -1])
fb <- data.table(Sample = rownames(fb),
Bacteroidetes = fb[, 1],
Firmicutes = fb[, 2])
fb <- data.table(merge(meta,
fb,
by = "Sample"))
lims <- log2(range(c(fb$Bacteroidetes,
fb$Firmicutes)))
p1 <- ggplot(fb,
aes(x = log2(Bacteroidetes),
y = log2(Firmicutes),
fill = Genotype)) +
geom_point(size = 2,
color = "black",
shape = 21) +
geom_abline(slope = 1,
intercept = 0,
linetype = "dashed") +
scale_x_continuous(limits = lims) +
scale_y_continuous(limits = lims) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
p2 <- ggplot(fb,
aes(x = log2(Bacteroidetes),
y = log2(Firmicutes),
fill = Week)) +
geom_point(size = 2,
color = "black",
shape = 21) +
geom_abline(slope = 1,
intercept = 0,
linetype = "dashed") +
scale_x_continuous(limits = lims) +
scale_y_continuous(limits = lims) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
p3 <- ggplot(fb,
aes(x = log2(Bacteroidetes),
y = log2(Firmicutes),
fill = Diet)) +
geom_point(size = 2,
color = "black",
shape = 21) +
geom_abline(slope = 1,
intercept = 0,
linetype = "dashed") +
scale_x_continuous(limits = lims) +
scale_y_continuous(limits = lims) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
p4 <- ggplot(fb,
aes(x = Week,
y = Bacteroidetes/Firmicutes,
fill = Diet,
group = paste0(Diet,
Cage,
Mouse_Num))) +
facet_grid(~ Genotype,
scale = "free_x") +
geom_hline(yintercept = 1,
linetype = "dashed") +
geom_line(position = position_dodge(0.3)) +
geom_point(size = 2,
color = "black",
shape = 21,
position = position_dodge(0.3)) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# panel.border = element_blank(),
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
# axis.ticks.x=element_blank(),
legend.position = "top")
tiff(filename = "tmp/bact_vs_firm_nov18_may19.tiff",
height = 6,
width =8,
units = "in",
res = 600,
compression = "lzw+p")
gridExtra::grid.arrange(p1, p2, p3, p4)
graphics.off()
gridExtra::grid.arrange(p1, p2, p3, p4)
fb[, B_F := Bacteroidetes/Firmicutes]
fb[, log2_B_F := log2(B_F)]
m1 <- lm(log2_B_F ~ 0 + Week*Diet,
data = droplevels(fb[Genotype == "Wild Type", ]))
s1 <- summary(m1)
ci1 <- confint(m1)
t1 <- data.table(Term = rownames(s1$coefficients),
Ratio = round(2^s1$coefficients[, 1], 3),
`95% C.I.L.L.` = round(2^ci1[, 1], 3),
`95% C.I.U.L.` = round(2^ci1[, 2], 3),
`p-Value` = round(s1$coefficients[, 4], 3),
Sign = "")
t1$Sign[t1$`p-Value` < 0.05] <- "*"
t1$Sign[t1$`p-Value` < 0.01] <- "**"
t1$`p-Value`[t1$`p-Value` < 0.001] <- "<0.001"
datatable(t1,
rownames = FALSE,
class = "cell-border stripe",
caption = "Wild Type")
m2 <- lm(log2_B_F ~ 0 + Week*Diet,
data = droplevels(fb[Genotype == "Nrf2 KO", ]))
s2 <- summary(m2)
ci2 <- confint(m2)
ci2 <- ci2[!is.na(ci2[, 1]),]
t2 <- data.table(Term = rownames(s2$coefficients),
Ratio = round(2^s2$coefficients[, 1], 3),
`95% C.I.L.L.` = round(2^ci2[, 1], 3),
`95% C.I.U.L.` = round(2^ci2[, 2], 3),
`p-Value` = round(s2$coefficients[, 4], 3),
Sign = "")
t2$Sign[t2$`p-Value` < 0.05] <- "*"
t2$Sign[t2$`p-Value` < 0.01] <- "**"
t2$`p-Value`[t2$`p-Value` < 0.001] <- "<0.001"
datatable(t2,
rownames = FALSE,
class = "cell-border stripe",
caption = "Nrf2 KO")
fb[, mu := mean(Bacteroidetes/Firmicutes),
by = c("Diet",
"Genotype",
"Week")]
fb[, sem := sd(Bacteroidetes/Firmicutes)/sqrt(.N),
by = c("Diet",
"Genotype",
"Week")]
mufb <- unique(fb[, c("Diet",
"Genotype",
"Week",
"mu",
"sem")])
p5 <- ggplot(mufb,
aes(x = Week,
y = mu,
ymin = mu - sem,
ymax = mu + sem,
fill = Diet,
group = Diet)) +
facet_wrap(~ Genotype,
scale = "free_x") +
geom_hline(yintercept = 1,
linetype = "dashed") +
geom_errorbar(position = position_dodge(0.3),
width = 0.4) +
geom_line(position = position_dodge(0.3)) +
geom_point(size = 3,
shape = 21,
position = position_dodge(0.3)) +
scale_x_discrete("") +
scale_y_continuous("Bacteroidetes/Firmicutes") +
scale_fill_grey("Treatment",
start = 0,
end = 1,
na.value = "red",
aesthetics = "fill") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# panel.border = element_blank(),
axis.title.x = element_blank(),
# axis.text.x = element_blank(),
axis.text.x = element_text(angle = 45,
hjust = 1),
# axis.ticks.x=element_blank(),
legend.position = "top")
tiff(filename = "tmp/avg_bact_firm_nov18_may19.tiff",
height = 4,
width = 6,
units = "in",
res = 600,
compression = "lzw+p")
print(p5)
graphics.off()
print(p5)
mufb[, est := paste0(round(mu, 2),
"(",
round(sem, 2),
")")]
t1 <- dcast.data.table(mufb,
Genotype + Diet ~ Week,
value.var = "est")
datatable(t1,
rownames = FALSE,
class = "cell-border stripe",
caption = "Average Ratio and SD of Bacteroidetes and Firmicutes",
options = list(search = FALSE,
pageLength = nrow(t1)))
tiff(filename = "tmp/fig7_alt_bact_vs_firm_nov18_may19.tiff",
height = 6,
width =8,
units = "in",
res = 600,
compression = "lzw+p")
gridExtra::grid.arrange(p1, p2, p3, p5)
graphics.off()
gridExtra::grid.arrange(p1, p2, p3, p5)
otu <- data.table(ps0@tax_table@.Data,
t(ps0@otu_table@.Data))
# Remove Species mapping'
otu$Species <- NULL
dim(otu)
[1] 16221 75
dt_pca <- t(ra_p[, 2:ncol(ra_p)])
colnames(dt_pca) <- ra_p$Phylum
dt_pca_p <- data.table(Sample = rownames(dt_pca),
dt_pca)
dt_pca_p <- merge(smpl,
dt_pca_p,
by = "Sample")
# Keep only the phylum with non-zero counts
tmp <- dt_pca_p[, 10:ncol(dt_pca_p)]
keep_p <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_p]
# m1 <- prcomp(dt_pca,
# center = TRUE,
# scale. = TRUE)
# m1 <- prcomp(dt_pca,
# center = FALSE,
# scale. = FALSE)
m1 <- prcomp(dt_pca,
center = TRUE,
scale. = FALSE)
summary(m1)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
Standard deviation 0.1016 0.07982 0.05398 0.03765 0.01000 0.008812 0.002823 0.0004897 0.0001899 0.00008382
Proportion of Variance 0.4864 0.30051 0.13743 0.06688 0.00472 0.003660 0.000380 0.0000100 0.0000000 0.00000000
Cumulative Proportion 0.4864 0.78693 0.92435 0.99123 0.99595 0.999610 0.999990 1.0000000 1.0000000 1.00000000
PC11 PC12 PC13
Standard deviation 0.00003105 0.0000258 0.00002073
Proportion of Variance 0.00000000 0.0000000 0.00000000
Cumulative Proportion 1.00000000 1.0000000 1.00000000
# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2
# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)
dt.scr <- merge(smpl,
dt.scr,
by = "Sample")
dt.scr
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot
dt.load <- melt.data.table(dt.rot,
id.vars = "feat",
measure.vars = 1:2,
variable.name = "pc",
value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
aes(x = feat,
y = loading)) +
facet_wrap(~ pc,
nrow = 2) +
geom_bar(stat = "identity") +
ggtitle("PC Loadings") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45,
hjust = 1))
p0
tiff(filename = "tmp/pc.1.2_loadings_phylum.tiff",
height = 5,
width = 6,
units = 'in',
res = 300,
compression = "lzw+p")
print(p0)
graphics.off()
print(p0)
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2],
sprintf('(%0.1f%% explained var.)',
100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
[1] "PC1 (48.6% explained var.)" "PC2 (30.1% explained var.)"
cntr <- data.table(aggregate(x = dt.scr$PC1,
by = list(dt.scr$Genotype),
FUN = "mean"),
aggregate(x = dt.scr$PC2,
by = list(dt.scr$Genotype),
FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
"PC1",
"PC2")
# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)
# Select top 5
dt.rot <- dt.rot[1:5, ]
# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
aes(x = PC1,
y = PC2)) +
# coord_equal() +
geom_point(data = dt.scr,
aes(fill = Genotype,
shape = factor(Timepoint)),
size = 3,
alpha = 0.5) +
geom_segment(aes(x = 0,
y = 0,
xend = 0.2*PC1,
yend = 0.2*PC2),
arrow = arrow(length = unit(1/2, 'picas')),
# size = 1,
color = "black") +
geom_text(aes(x = 0.22*PC1,
y = 0.22*PC2,
label = dt.rot$feat),
# size = 5,
hjust = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2]) +
scale_fill_manual(name = "Group",
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
scale_shape_manual(breaks = 1:3,
values = 21:23) +
geom_label(data = cntr,
aes(x = PC1,
y = PC2,
label = Genotype,
colour = Genotype),
alpha = 0.5,
size = 3) +
scale_color_manual(guide = FALSE,
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
ggtitle("") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
tiff(filename = "tmp/phylum_biplot_grp.tiff",
height = 7,
width = 7,
units = 'in',
res = 300,
compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
geom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
# Generic biplot
biplot(m1)
dt_pca <- t(ra_c[, 2:ncol(ra_c)])
colnames(dt_pca) <- ra_c$Class
dt_pca_c <- data.table(Sample = rownames(dt_pca),
dt_pca)
dt_pca_c <- merge(smpl,
dt_pca_c,
by = "Sample")
# Keep only the Class with non-zero counts
tmp <- dt_pca_c[, 10:ncol(dt_pca_c)]
keep_c <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_c]
m1 <- prcomp(dt_pca,
center = TRUE,
scale. = FALSE)
summary(m1)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Standard deviation 0.08999 0.07999 0.0740 0.05046 0.03517 0.02037 0.01466 0.009891 0.008728 0.007865 0.005555
Proportion of Variance 0.32837 0.25949 0.2220 0.10327 0.05015 0.01682 0.00872 0.003970 0.003090 0.002510 0.001250
Cumulative Proportion 0.32837 0.58786 0.8099 0.91318 0.96333 0.98016 0.98887 0.992840 0.995930 0.998440 0.999690
PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
Standard deviation 0.00265 0.000615 0.0004527 0.0001791 0.00008448 0.0000719 0.0000276 0.00002248 0.00002123
Proportion of Variance 0.00028 0.000020 0.0000100 0.0000000 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000
Cumulative Proportion 0.99997 0.999990 1.0000000 1.0000000 1.00000000 1.0000000 1.0000000 1.00000000 1.00000000
PC21 PC22 PC23 PC24
Standard deviation 0.00001378 0.00001094 0.00001006 0.000009136
Proportion of Variance 0.00000000 0.00000000 0.00000000 0.000000000
Cumulative Proportion 1.00000000 1.00000000 1.00000000 1.000000000
# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2
# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)
dt.scr <- merge(smpl,
dt.scr,
by = "Sample")
dt.scr
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot
dt.load <- melt.data.table(dt.rot,
id.vars = "feat",
measure.vars = 1:2,
variable.name = "pc",
value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
aes(x = feat,
y = loading)) +
facet_wrap(~ pc,
nrow = 2) +
geom_bar(stat = "identity") +
ggtitle("PC Loadings") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45,
hjust = 1))
p0
tiff(filename = "tmp/pc.1.2_loadings_Class.tiff",
height = 5,
width = 6,
units = 'in',
res = 300,
compression = "lzw+p")
print(p0)
graphics.off()
print(p0)
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2],
sprintf('(%0.1f%% explained var.)',
100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
[1] "PC1 (32.8% explained var.)" "PC2 (25.9% explained var.)"
cntr <- data.table(aggregate(x = dt.scr$PC1,
by = list(dt.scr$Genotype),
FUN = "mean"),
aggregate(x = dt.scr$PC2,
by = list(dt.scr$Genotype),
FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
"PC1",
"PC2")
# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)
# Select top 8
dt.rot <- dt.rot[1:8, ]
# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
aes(x = PC1,
y = PC2)) +
geom_point(data = dt.scr,
aes(fill = Genotype,
shape = factor(Timepoint)),
size = 3,
alpha = 0.5) +
geom_segment(aes(x = 0,
y = 0,
xend = 0.2*PC1,
yend = 0.2*PC2),
arrow = arrow(length = unit(1/2, 'picas')),
# size = 1,
color = "black") +
geom_text(aes(x = 0.22*PC1,
y = 0.22*PC2,
label = dt.rot$feat),
# size = 5,
hjust = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2]) +
scale_fill_manual(name = "Group",
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
scale_shape_manual(breaks = 1:3,
values = 21:23) +
geom_label(data = cntr,
aes(x = PC1,
y = PC2,
label = Genotype,
colour = Genotype),
alpha = 0.5,
size = 3) +
scale_color_manual(guide = FALSE,
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
ggtitle("") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
tiff(filename = "tmp/class_biplot_gen.tiff",
height = 7,
width = 7,
units = 'in',
res = 300,
compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
geom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
# Generic biplot
biplot(m1)
dt_pca <- t(ra_g[, 2:ncol(ra_g)])
colnames(dt_pca) <- ra_g$Genus
dt_pca_g <- data.table(Sample = rownames(dt_pca),
dt_pca)
dt_pca_g <- merge(smpl,
dt_pca_g,
by = "Sample")
# Keep only the Genus with non-zero counts
tmp <- dt_pca_g[, 10:ncol(dt_pca_g)]
keep_g <- colnames(tmp)[colSums(tmp) > 0]
dt_pca <- dt_pca[, keep_g]
m1 <- prcomp(dt_pca,
center = TRUE,
scale. = FALSE)
summary(m1)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Standard deviation 0.1561 0.07993 0.07342 0.06674 0.05499 0.04364 0.03276 0.02879 0.02722 0.02502 0.02370
Proportion of Variance 0.4792 0.12568 0.10605 0.08761 0.05948 0.03746 0.02112 0.01631 0.01457 0.01232 0.01105
Cumulative Proportion 0.4792 0.60492 0.71097 0.79858 0.85806 0.89552 0.91663 0.93294 0.94752 0.95983 0.97088
PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21
Standard deviation 0.01666 0.01491 0.01453 0.01219 0.01067 0.01000 0.009643 0.008103 0.006794 0.006453
Proportion of Variance 0.00546 0.00437 0.00415 0.00292 0.00224 0.00197 0.001830 0.001290 0.000910 0.000820
Cumulative Proportion 0.97634 0.98072 0.98487 0.98779 0.99003 0.99200 0.993830 0.995120 0.996030 0.996850
PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31
Standard deviation 0.005544 0.005349 0.004051 0.003882 0.00355 0.003155 0.002829 0.00269 0.002344 0.002183
Proportion of Variance 0.000600 0.000560 0.000320 0.000300 0.00025 0.000200 0.000160 0.00014 0.000110 0.000090
Cumulative Proportion 0.997450 0.998010 0.998340 0.998630 0.99888 0.999080 0.999230 0.99938 0.999480 0.999580
PC32 PC33 PC34 PC35 PC36 PC37 PC38 PC39 PC40 PC41
Standard deviation 0.001933 0.001904 0.00180 0.001411 0.00123 0.001175 0.001096 0.0009692 0.0008116 0.0007119
Proportion of Variance 0.000070 0.000070 0.00006 0.000040 0.00003 0.000030 0.000020 0.0000200 0.0000100 0.0000100
Cumulative Proportion 0.999650 0.999720 0.99979 0.999830 0.99986 0.999880 0.999910 0.9999200 0.9999400 0.9999500
PC42 PC43 PC44 PC45 PC46 PC47 PC48 PC49 PC50
Standard deviation 0.0006493 0.0005688 0.0005378 0.0004891 0.0004791 0.0004499 0.0003972 0.0003618 0.0003308
Proportion of Variance 0.0000100 0.0000100 0.0000100 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Cumulative Proportion 0.9999600 0.9999600 0.9999700 0.9999700 0.9999800 0.9999800 0.9999800 0.9999900 0.9999900
PC51 PC52 PC53 PC54 PC55 PC56 PC57 PC58 PC59
Standard deviation 0.0003232 0.0002766 0.0002682 0.0002355 0.0002136 0.0002006 0.0001936 0.0001866 0.0001584
Proportion of Variance 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Cumulative Proportion 0.9999900 0.9999900 0.9999900 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67
Standard deviation 0.0001441 0.0001155 0.0001064 0.00009134 0.00008498 0.00006457 0.00005041 0.00004638
Proportion of Variance 0.0000000 0.0000000 0.0000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
Cumulative Proportion 1.0000000 1.0000000 1.0000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000
PC68 PC69
Standard deviation 0.00003256 0.00000000000000001228
Proportion of Variance 0.00000000 0.00000000000000000000
Cumulative Proportion 1.00000000 1.00000000000000000000
# Select PC-s to pliot (PC1 & PC2)
choices <- 1:2
# Add meta data
dt.scr <- data.table(m1$x[, choices])
dt.scr$Sample <- rownames(m1$x)
dt.scr <- merge(smpl,
dt.scr,
by = "Sample")
dt.scr
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
dt.rot
dt.load <- melt.data.table(dt.rot,
id.vars = "feat",
measure.vars = 1:2,
variable.name = "pc",
value.name = "loading")
dt.load$feat <- factor(dt.load$feat,
levels = unique(dt.load$feat))
# Plot loadings
p0 <- ggplot(data = dt.load,
aes(x = feat,
y = loading)) +
facet_wrap(~ pc,
nrow = 2) +
geom_bar(stat = "identity") +
ggtitle("PC Loadings") +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45,
hjust = 1))
p0
tiff(filename = "tmp/pc.1.2_loadings_genus.tiff",
height = 5,
width = 6,
units = 'in',
res = 300,
compression = "lzw+p")
print(p0)
graphics.off()
print(p0)
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[1:2],
sprintf('(%0.1f%% explained var.)',
100*m1$sdev[choices]^2/sum(m1$sdev^2)))
u.axis.labs
[1] "PC1 (47.9% explained var.)" "PC2 (12.6% explained var.)"
cntr <- data.table(aggregate(x = dt.scr$PC1,
by = list(dt.scr$Genotype),
FUN = "mean"),
aggregate(x = dt.scr$PC2,
by = list(dt.scr$Genotype),
FUN = "mean")$x)
colnames(cntr) <- c("Genotype",
"PC1",
"PC2")
# Based on Figure p0, keep only a few variables with high loadings in PC1 and PC2----
dt.rot[, rating:= (PC1)^2 + (PC2)^2]
setorder(dt.rot, -rating)
# Select top 9
dt.rot <- dt.rot[1:9, ]
# var.keep.ndx <- which(dt.rot$feat %in% c(...))
# Or select all
# var.keep.ndx <- 3:ncol(dt1)
# Use dt.rot[var.keep.ndx,] and dt.rot$feat[var.keep.ndx]
p1 <- ggplot(data = dt.rot,
aes(x = PC1,
y = PC2)) +
geom_point(data = dt.scr,
aes(fill = Genotype,
shape = factor(Timepoint)),
size = 3,
alpha = 0.5) +
geom_segment(aes(x = 0,
y = 0,
xend = 0.2*PC1,
yend = 0.2*PC2),
arrow = arrow(length = unit(1/2, 'picas')),
# size = 1,
color = "black") +
geom_text(aes(x = 0.22*PC1,
y = 0.22*PC2,
label = dt.rot$feat),
# size = 5,
hjust = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2]) +
scale_fill_manual(name = "Group",
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
scale_shape_manual(breaks = 1:3,
values = 21:23) +
geom_label(data = cntr,
aes(x = PC1,
y = PC2,
label = Genotype,
colour = Genotype),
alpha = 0.5,
size = 3) +
scale_color_manual(guide = FALSE,
breaks = c("Wild Type",
"Nrf2 KO"),
values = c("red",
"blue")) +
ggtitle("") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
tiff(filename = "tmp/genus_biplot_gen.tiff",
height = 7,
width = 7,
units = 'in',
res = 300,
compression = "lzw+p")
print(p1)
graphics.off()
ggplotly(p1)
geom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
# Generic biplot
biplot(m1)
cntr <- data.table(aggregate(x = dt.scr$PC1,
by = list(dt.scr$Diet),
FUN = "mean"),
aggregate(x = dt.scr$PC2,
by = list(dt.scr$Diet),
FUN = "mean")$x)
colnames(cntr) <- c("Diet",
"PC1",
"PC2")
p2 <- ggplot(data = dt.rot,
aes(x = PC1,
y = PC2)) +
geom_point(data = dt.scr,
aes(fill = Diet,
shape = factor(Timepoint)),
size = 3,
alpha = 0.5) +
geom_segment(aes(x = 0,
y = 0,
xend = 0.2*PC1,
yend = 0.2*PC2),
arrow = arrow(length = unit(1/2, 'picas')),
# size = 1,
color = "black") +
geom_text(aes(x = 0.22*PC1,
y = 0.22*PC2,
label = dt.rot$feat),
# size = 5,
hjust = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2]) +
scale_fill_manual(name = "Group",
breaks = c("AIN93M",
"PEITC",
"Pooled"),
values = c("red",
"blue",
"black")) +
scale_shape_manual(breaks = 1:3,
values = 21:23) +
geom_label(data = cntr,
aes(x = PC1,
y = PC2,
label = Diet,
colour = Diet),
alpha = 0.5,
size = 3) +
scale_color_manual(guide = FALSE,
breaks = c("AIN93M",
"PEITC",
"Pooled"),
values = c("red",
"blue",
"black")) +
ggtitle("") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none")
tiff(filename = "tmp/genus_biplot_diet.tiff",
height = 7,
width = 7,
units = 'in',
res = 300,
compression = "lzw+p")
print(p2)
graphics.off()
ggplotly(p2)
geom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issuesgeom_GeomLabel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server 7.5 (Maipo)
Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] DT_0.6 plotly_4.9.0 ggplot2_3.2.0 data.table_1.12.2 phyloseq_1.26.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 ape_5.3 lattice_0.20-35 tidyr_0.8.3 Biostrings_2.50.2
[6] assertthat_0.2.1 digest_0.6.19 foreach_1.4.4 mime_0.6 R6_2.4.0
[11] plyr_1.8.4 stats4_3.5.0 httr_1.4.0 pillar_1.4.0 zlibbioc_1.28.0
[16] rlang_0.4.0 lazyeval_0.2.2 rstudioapi_0.10 vegan_2.5-5 S4Vectors_0.20.1
[21] Matrix_1.2-14 labeling_0.3 splines_3.5.0 stringr_1.4.0 htmlwidgets_1.3
[26] igraph_1.2.4.1 munsell_0.5.0 shiny_1.3.2 compiler_3.5.0 httpuv_1.5.1
[31] xfun_0.7 pkgconfig_2.0.2 BiocGenerics_0.28.0 multtest_2.38.0 mgcv_1.8-23
[36] htmltools_0.3.6 biomformat_1.10.1 tidyselect_0.2.5 gridExtra_2.3 tibble_2.1.3
[41] IRanges_2.16.0 codetools_0.2-15 permute_0.9-5 viridisLite_0.3.0 crayon_1.3.4
[46] dplyr_0.8.1 withr_2.1.2 later_0.8.0 MASS_7.3-49 grid_3.5.0
[51] nlme_3.1-137 jsonlite_1.6 xtable_1.8-4 gtable_0.3.0 lifecycle_0.1.0
[56] magrittr_1.5 scales_1.1.0 stringi_1.4.3 farver_2.0.3 XVector_0.22.0
[61] reshape2_1.4.3 promises_1.0.1 Rhdf5lib_1.4.3 iterators_1.0.10 tools_3.5.0
[66] ade4_1.7-13 Biobase_2.42.0 glue_1.3.1 purrr_0.3.2 crosstalk_1.0.0
[71] parallel_3.5.0 survival_2.41-3 yaml_2.2.0 colorspace_1.4-1 rhdf5_2.26.2
[76] cluster_2.0.7-1 knitr_1.23